{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Benchmark concepts\n",
    "\n",
    "This notebook goes through all concepts and runs the query using EXPLAIN ANALYZE. This is useful for informing how long the queries will take and for benchmarking performance improvements."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import psycopg2\n",
    "import getpass\n",
    "\n",
    "from collections import OrderedDict\n",
    "\n",
    "# database config\n",
    "sqluser=getpass.getuser()\n",
    "# keep sqlpass blank if using peer authentication\n",
    "sqlpass=''\n",
    "# database\n",
    "sqldb='mimic'\n",
    "sqlschema='public,mimiciii'\n",
    "\n",
    "query_schema = 'set search_path to ' + sqlschema + ';'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Connected!\n"
     ]
    }
   ],
   "source": [
    "if (not sqlpass) & (sqlpass != ''):\n",
    "    con = psycopg2.connect(user=sqluser, password=sqlpass, database=sqldb)\n",
    "else:\n",
    "    con = psycopg2.connect(user=sqluser, database=sqldb)\n",
    "print('Connected!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# function to read a single script\n",
    "def read_script(base_path, script_name):\n",
    "    query = ''\n",
    "    with open(os.path.join(base_path,script_name)) as f:\n",
    "        for line in f.readlines():\n",
    "            line = line.lstrip(' ').rstrip(' ')\n",
    "            if len(line)<1:\n",
    "                continue\n",
    "            elif len(line)<2:\n",
    "                query += line\n",
    "            else:\n",
    "                # ignore comments\n",
    "                if '--' in line:\n",
    "                    line = line[0:line.index('--')]\n",
    "                query += line\n",
    "    # replace double newlines with single newline\n",
    "    query = query.replace('\\n\\n','\\n')\n",
    "    return query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def extract_drop_line(query):\n",
    "    # hack out the drop materialized view/drop table statement\n",
    "    query_drop = []\n",
    "    if 'drop materialized view ' in query.lower():\n",
    "        query_drop.extend(re.findall('drop materialized view [A-z0-9_ ]+;\\n',query,re.I))\n",
    "    if 'drop table ' in query.lower():\n",
    "        query_drop.extend(re.findall('drop table [A-z0-9_ ]+;\\n',query,re.I))\n",
    "\n",
    "    if not query_drop:\n",
    "        query_drop = ''\n",
    "    elif len(query_drop)==1:\n",
    "        query = query.replace(query_drop[0], '')\n",
    "        query = [query]\n",
    "    else:\n",
    "        # have multiple drop/create statements\n",
    "        query_parts = list() #query.split(query_drop[1])[0]\n",
    "\n",
    "        for i, q in enumerate(query_drop):\n",
    "            # get first part of query\n",
    "            query_split = query.split(q)\n",
    "            query_parts.append(query_split[0])\n",
    "            query = query_split[1]\n",
    "\n",
    "        # now append the final table created in the full query\n",
    "        query_parts.append(query)\n",
    "        # remove the first element\n",
    "        query_parts = query_parts[1:]\n",
    "        \n",
    "        query = query_parts\n",
    "    \n",
    "    return query, query_drop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "# benchmark query\n",
    "def benchmark_query(con, query, query_schema=query_schema, query_drop=query_drop, parallel_workers=None):\n",
    "    cur = con.cursor()\n",
    "    cur.execute(query_schema)\n",
    "    if parallel_workers:\n",
    "        cur.execute('SET max_parallel_workers_per_gather TO {};'.format(parallel_workers))\n",
    "    else:\n",
    "        cur.execute('SET max_parallel_workers_per_gather TO DEFAULT;')\n",
    "    cur.execute(query_drop)\n",
    "    cur.execute('explain analyze ' + query)\n",
    "    result = cur.fetchall()\n",
    "    cur.execute('commit;')\n",
    "    cur.close()\n",
    "\n",
    "    query_plan = [item[0] for item in result]\n",
    "    time = float(query_plan[-1].replace('Execution time: ', '').replace(' ms', ''))\n",
    "    \n",
    "    return time, query_plan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "demographics/icustay_detail.sql...\n",
      "\n",
      "   0.7s\n"
     ]
    }
   ],
   "source": [
    "# example on a single concept\n",
    "base_path = '/home/alistairewj/git/mimic-code/concepts'\n",
    "script_name = 'demographics/icustay_detail.sql'\n",
    "\n",
    "print(script_name, end='...')\n",
    "\n",
    "# read the script's query\n",
    "query = read_script(base_path, script_name)\n",
    "# returns a list of queries/drop statements\n",
    "query, query_drop = extract_drop_line(query)\n",
    "\n",
    "if len(query)==1:\n",
    "    # most of the time each script only creates a single view/table\n",
    "    query = query[0]\n",
    "    query_drop = query_drop[0]\n",
    "    time, query_plan = benchmark_query(con, query, query_schema=query_schema, query_drop=query_drop)\n",
    "    print('{:6.1f}s'.format(time/1000))\n",
    "else:\n",
    "    print('')\n",
    "    for i in range(len(query)):\n",
    "        time, query_plan = benchmark_query(con, query[i], query_schema=query_schema, query_drop=query_drop[i])\n",
    "        print('  part {} - {:6.1f}s'.format(i, time/1000))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Benchmark all concepts in make-concepts.sql\n",
    "\n",
    "Uses parallel querying available in PostgreSQL 10."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "code-status.sql                         ...   51.4s\n",
      "echo-data.sql                           ...   24.8s\n",
      "durations/ventilation-durations.sql     ... \n",
      "  part 0...                    43.7s\n",
      "\n",
      "  part 1...                    22.0s\n",
      "durations/crrt-durations.sql            ...   14.3s\n",
      "durations/adenosine-durations.sql       ...    0.0s\n",
      "durations/dobutamine-durations.sql      ...    0.5s\n",
      "durations/dopamine-durations.sql        ...   14.6s\n",
      "durations/epinephrine-durations.sql     ...    6.9s\n",
      "durations/isuprel-durations.sql         ...    0.0s\n",
      "durations/milrinone-durations.sql       ...    6.1s\n",
      "durations/norepinephrine-durations.sql  ...   16.5s\n",
      "durations/phenylephrine-durations.sql   ...   13.3s\n",
      "durations/vasopressin-durations.sql     ...    2.5s\n",
      "durations/vasopressor-durations.sql     ...   23.0s\n",
      "durations/weight-durations.sql          ...    9.6s\n",
      "comorbidity/elixhauser-ahrq-v37-with-drg.sql...   12.9s\n",
      "comorbidity/elixhauser-ahrq-v37-no-drg.sql...   11.6s\n",
      "comorbidity/elixhauser-ahrq-v37-no-drg-all-icd.sql...   12.5s\n",
      "comorbidity/elixhauser-quan.sql         ...    8.6s\n",
      "comorbidity/elixhauser-score-ahrq.sql   ...    0.2s\n",
      "comorbidity/elixhauser-score-quan.sql   ...    0.1s\n",
      "demographics/HeightWeightQuery.sql      ...   14.7s\n",
      "demographics/icustay_detail.sql         ...    0.7s\n",
      "firstday/blood-gas-first-day.sql        ...   32.7s\n",
      "firstday/blood-gas-first-day-arterial.sql...   36.7s\n",
      "firstday/gcs-first-day.sql              ...    7.9s\n",
      "firstday/height-first-day.sql           ...    4.0s\n",
      "firstday/labs-first-day.sql             ...   55.0s\n",
      "firstday/rrt-first-day.sql              ...   16.0s\n",
      "firstday/urine-output-first-day.sql     ...    3.2s\n",
      "firstday/ventilation-first-day.sql      ...    0.2s\n",
      "firstday/vitals-first-day.sql           ...  129.1s\n",
      "firstday/weight-first-day.sql           ...    1.3s\n",
      "fluid-balance/urine-output.sql          ...   18.2s\n",
      "sepsis/angus.sql                        ...    2.8s\n",
      "sepsis/martin.sql                       ...    2.3s\n",
      "sepsis/explicit.sql                     ...    0.4s\n",
      "organfailure/kdigo-creatinine.sql       ...   30.8s\n",
      "organfailure/kdigo-uo.sql               ...  194.7s\n",
      "organfailure/kdigo-stages-7day.sql      ...   10.9s\n",
      "organfailure/kdigo-stages-48hr.sql      ...    6.2s\n",
      "organfailure/meld.sql                   ...    0.7s\n",
      "severityscores/oasis.sql                ...    1.5s\n",
      "severityscores/sofa.sql                 ...   35.8s\n",
      "severityscores/saps.sql                 ...    4.7s\n",
      "severityscores/sapsii.sql               ...    7.8s\n",
      "severityscores/apsiii.sql               ...    3.6s\n",
      "severityscores/lods.sql                 ...    3.7s\n",
      "severityscores/sirs.sql                 ...    0.5s\n"
     ]
    }
   ],
   "source": [
    "query_plans = OrderedDict()\n",
    "query_times = OrderedDict()\n",
    "\n",
    "base_path = '/home/alistairewj/git/mimic-code/concepts'\n",
    "\n",
    "# read through all make concepts\n",
    "with open(os.path.join(base_path,'make-concepts.sql')) as fp:\n",
    "    for line in fp.readlines():\n",
    "        if len(line)<2:\n",
    "            continue\n",
    "        elif line[0:2] != '\\\\i':\n",
    "            continue\n",
    "        elif 'ccs_diagnosis_table.sql' in line:\n",
    "            continue\n",
    "            \n",
    "        # get the name of the script\n",
    "        script_name = line[3:].rstrip('\\n')\n",
    "        print('{:40s}'.format(script_name), end='... ')\n",
    "        \n",
    "        # read the script's query\n",
    "        query = read_script(base_path, script_name)\n",
    "        query, query_drop = extract_drop_line(query)\n",
    "\n",
    "\n",
    "        if len(query)==1:\n",
    "            # most of the time each script only creates a single view/table\n",
    "            q = query[0]\n",
    "            qd = query_drop[0]\n",
    "            time, query_plan = benchmark_query(con, q, query_schema=query_schema, query_drop=qd)\n",
    "            print('{:6.1f}s'.format(time/1000))\n",
    "        else:\n",
    "            query_plans[script_name] = list()\n",
    "            query_times[script_name] = list()\n",
    "            for i in range(len(query)):\n",
    "                time, query_plan = benchmark_query(con, query[i], query_schema=query_schema, query_drop=query_drop[i])\n",
    "                print('')\n",
    "                print('  part {}...{:18s}{:6.1f}s'.format(i, '', time/1000))\n",
    "\n",
    "                query_plans[script_name].append(query_plan)\n",
    "                query_times[script_name].append(time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Benchmark all concepts in make-concepts.sql\n",
    "\n",
    "Does *not* use parallel querying."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "code-status.sql                         ...   50.3s\n",
      "echo-data.sql                           ...   25.2s\n",
      "durations/ventilation-durations.sql     ... \n",
      "  part 0...                    44.1s\n",
      "\n",
      "  part 1...                    17.6s\n",
      "durations/crrt-durations.sql            ...   14.5s\n",
      "durations/adenosine-durations.sql       ...    0.0s\n",
      "durations/dobutamine-durations.sql      ...    0.5s\n",
      "durations/dopamine-durations.sql        ...   14.6s\n",
      "durations/epinephrine-durations.sql     ...    7.0s\n",
      "durations/isuprel-durations.sql         ...    0.0s\n",
      "durations/milrinone-durations.sql       ...    6.0s\n",
      "durations/norepinephrine-durations.sql  ...   16.5s\n",
      "durations/phenylephrine-durations.sql   ...   13.2s\n",
      "durations/vasopressin-durations.sql     ...    2.4s\n",
      "durations/vasopressor-durations.sql     ...   21.5s\n",
      "durations/weight-durations.sql          ...    9.5s\n",
      "comorbidity/elixhauser-ahrq-v37-with-drg.sql...   12.4s\n",
      "comorbidity/elixhauser-ahrq-v37-no-drg.sql...   11.3s\n",
      "comorbidity/elixhauser-ahrq-v37-no-drg-all-icd.sql...   12.7s\n",
      "comorbidity/elixhauser-quan.sql         ...    8.6s\n",
      "comorbidity/elixhauser-score-ahrq.sql   ...    0.2s\n",
      "comorbidity/elixhauser-score-quan.sql   ...    0.2s\n",
      "demographics/HeightWeightQuery.sql      ...   14.4s\n",
      "demographics/icustay_detail.sql         ...    0.7s\n",
      "firstday/blood-gas-first-day.sql        ...   32.5s\n",
      "firstday/blood-gas-first-day-arterial.sql...   32.5s\n",
      "firstday/gcs-first-day.sql              ...    8.0s\n",
      "firstday/height-first-day.sql           ...    4.1s\n",
      "firstday/labs-first-day.sql             ...   56.3s\n",
      "firstday/rrt-first-day.sql              ...   15.9s\n",
      "firstday/urine-output-first-day.sql     ...    3.3s\n",
      "firstday/ventilation-first-day.sql      ...    0.1s\n",
      "firstday/vitals-first-day.sql           ...  123.3s\n",
      "firstday/weight-first-day.sql           ...    1.3s\n",
      "fluid-balance/urine-output.sql          ...   16.2s\n",
      "sepsis/angus.sql                        ...    2.7s\n",
      "sepsis/martin.sql                       ...    2.3s\n",
      "sepsis/explicit.sql                     ...    0.3s\n",
      "organfailure/kdigo-creatinine.sql       ...   33.9s\n",
      "organfailure/kdigo-uo.sql               ...  181.3s\n",
      "organfailure/kdigo-stages-7day.sql      ...   10.4s\n",
      "organfailure/kdigo-stages-48hr.sql      ...    6.1s\n",
      "organfailure/meld.sql                   ...    0.6s\n",
      "severityscores/oasis.sql                ...    1.5s\n",
      "severityscores/sofa.sql                 ...   34.9s\n",
      "severityscores/saps.sql                 ...    4.7s\n",
      "severityscores/sapsii.sql               ...    7.7s\n",
      "severityscores/apsiii.sql               ...    3.5s\n",
      "severityscores/lods.sql                 ...    3.6s\n",
      "severityscores/sirs.sql                 ...    0.9s\n"
     ]
    }
   ],
   "source": [
    "# same thing, but test parallel\n",
    "query_plans_single_core = OrderedDict()\n",
    "query_times_single_core = OrderedDict()\n",
    "parallel_workers = 0\n",
    "\n",
    "base_path = '/home/alistairewj/git/mimic-code/concepts'\n",
    "\n",
    "# read through all make concepts\n",
    "with open(os.path.join(base_path,'make-concepts.sql')) as fp:\n",
    "    for line in fp.readlines():\n",
    "        if len(line)<2:\n",
    "            continue\n",
    "        elif line[0:2] != '\\\\i':\n",
    "            continue\n",
    "        elif 'ccs_diagnosis_table.sql' in line:\n",
    "            continue\n",
    "            \n",
    "        # get the name of the script\n",
    "        script_name = line[3:].rstrip('\\n')\n",
    "        print('{:40s}'.format(script_name), end='... ')\n",
    "        \n",
    "        # read the script's query\n",
    "        query = read_script(base_path, script_name)\n",
    "        query, query_drop = extract_drop_line(query)\n",
    "\n",
    "\n",
    "        if len(query)==1:\n",
    "            # most of the time each script only creates a single view/table\n",
    "            q = query[0]\n",
    "            qd = query_drop[0]\n",
    "            time, query_plan = benchmark_query(con, q, query_schema=query_schema, query_drop=qd,\n",
    "                                               parallel_workers=0)\n",
    "            print('{:6.1f}s'.format(time/1000))\n",
    "            query_plans_single_core[script_name] = query_plan\n",
    "            query_times_single_core[script_name] = time\n",
    "        else:\n",
    "            query_plans_single_core[script_name] = list()\n",
    "            query_times_single_core[script_name] = list()\n",
    "            print('')\n",
    "            for i in range(len(query)):\n",
    "                time, query_plan = benchmark_query(con, query[i],\n",
    "                                                   query_schema=query_schema, query_drop=query_drop[i],\n",
    "                                                   parallel_workers=0)\n",
    "                print('  part {}...{:18s}{:6.1f}s'.format(i, '', time/1000))\n",
    "\n",
    "                query_plans_single_core[script_name].append(query_plan)\n",
    "                query_times_single_core[script_name].append(time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compare parallel with no parallel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# first print all queries which used a parallel plan\n",
    "for q in query_plans:\n",
    "    for i, l in enumerate(query_plans[q]):\n",
    "        if 'Parallel' in l:\n",
    "            print(q)\n",
    "            break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If the above prints nothing, no queries are using a parallel plan! :("
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
